📥 Import Necessary Libraries¶

In [1]:
import numpy as np 
import httpx
from selectolax.parser import HTMLParser
from dataclasses import dataclass, asdict
import re
import csv
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
%matplotlib inline
sns.set_style("whitegrid")
plt.style.use("fivethirtyeight")
sns.set_theme()

🌐 Data Collection and Web Scrapping¶

I collect football player data from https://www.transfermarkt.com/. Our data will consist of name of the player, his age, position, club and country of the club. Also, the number of matches he played and how many time he got subbed on or off. All of these variables are considered explanatory variables and the target variable is the player's market value in euros.¶
In [2]:
@dataclass
class Player:
    name: str
    position: str
    age : int
    club : str
    matches : int
    goals : int
    assists : int
    subOn : int
    subOff: int
    value : float
        
        
def parse_players(html):
    results=[]
    for x in {'odd', 'even'}:
    
        players= html.css("tr."+x)
    
        for player in players:
            new_data=Player(name= player.css_first("td.hauptlink").text(),
                            position= player.css("tr")[2].text(),
                            age= player.css("td.zentriert")[1].text(),
                            club= player.css("a")[2].attributes.get('title', ''),
                            matches= player.css("td.zentriert")[4].text(),
                            goals= player.css("td.zentriert")[5].text(),
                            assists= player.css("td.zentriert")[7].text(),
                            subOn= player.css("td.zentriert")[11].text(),
                            subOff= player.css("td.zentriert")[12].text(),
                            value= re.findall("\d+\.\d+", player.css_first("td.rechts").text())[0]

            )

            results.append(asdict(new_data))
    return(results)
  
def to_csv(results):
    with open("results.csv", "a", encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames = ["name", "position","age", "club", "matches",\
                                                 "goals", "assists", "subOn", "subOff", "value"])
        writer.writerows(results)
        
def get_html(page):
    url = f"https://www.transfermarkt.com/spieler-statistik/wertvollstespieler/marktwertetop?land_id=0&ausrichtung=alle&spielerposition_id=alle&altersklasse=alle&jahrgang=0&kontinent_id=0&plus=1&page={page}"
    resp = httpx.get(url)
    return HTMLParser(resp.text)
In [3]:
@dataclass
class Team:
    club: str
    country: str
   
        
def parse_teams(html):
    results=[]
    for x in {'odd', 'even'}:
        teams= html.css("tr."+x)
    
        for team in teams:
            new_data=Team(club= team.css("a")[0].attributes.get('title', ''),
                            country=team.css("img")[1].attributes.get('title', '')
            )

            results.append(asdict(new_data))
    return(results)
  
def to_clubcsv(results):
    with open("clubs.csv", "a", encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames = ["club", "country"])
        writer.writerows(results)
        
def get_club_html(page):
    url = f"https://www.transfermarkt.us/uefa/klubrangliste/statistik/stat/page/{page}/"
    resp = httpx.get(url)
    return HTMLParser(resp.text)
In [4]:
#Looping over the pages to collect data:
for page in range (1,10):
        html=get_html(page)
        res= parse_players(html)
        to_csv(res)
for page in range (1,5):
        html=get_club_html(page)
        res= parse_teams(html)
        to_clubcsv(res)

💾 Data Preparation & Cleaning¶

In [ ]:
 
In [5]:
#Read datafiles:
#Read player file that has players informations
df=pd.read_csv('results.csv', sep=',', names=["name", "position","age", "club", "matches", "goals", "assists", "subOn",\
                                              "subOff", "value"] )   
#Read club file that has clubs informations
clubs=pd.read_csv("clubs.csv", sep=',', names=["club", "country"] )
#Merging both files
data=df.merge(clubs, how='left', on='club')

data.sort_values("value",ascending=False, inplace=True)
data=data.reset_index(drop=True)
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 225 entries, 0 to 224
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   name      225 non-null    object 
 1   position  225 non-null    object 
 2   age       225 non-null    int64  
 3   club      225 non-null    object 
 4   matches   225 non-null    int64  
 5   goals     225 non-null    int64  
 6   assists   225 non-null    int64  
 7   subOn     225 non-null    int64  
 8   subOff    225 non-null    int64  
 9   value     225 non-null    float64
 10  country   200 non-null    object 
dtypes: float64(1), int64(6), object(4)
memory usage: 19.5+ KB
Some of the countries are missing, let's fix that!¶
In [6]:
data[data.country.isna()]
Out[6]:
name position age club matches goals assists subOn subOff value country
63 Bruno Guimarães Defensive Midfield 25 Newcastle United 35 4 5 3 11 60.0 NaN
73 Moisés Caicedo Defensive Midfield 21 Brighton & Hove Albion 40 1 1 2 5 55.0 NaN
84 Ivan Toney Centre-Forward 27 Brentford FC 35 21 5 2 4 50.0 NaN
87 Alexander Isak Centre-Forward 23 Newcastle United 29 11 3 9 17 50.0 NaN
113 Sven Botman Centre-Back 23 Newcastle United 39 0 0 3 2 45.0 NaN
116 Amadou Onana Defensive Midfield 21 Everton FC 34 1 1 3 13 42.0 NaN
117 Alexis Mac Allister Central Midfield 24 Brighton & Hove Albion 39 11 3 4 9 42.0 NaN
142 Anthony Gordon Left Winger 22 Newcastle United 31 3 1 14 11 40.0 NaN
147 James Ward-Prowse Central Midfield 28 Southampton FC 42 9 3 1 1 38.0 NaN
150 Joelinton Attacking Midfield 26 Newcastle United 37 8 3 3 3 38.0 NaN
154 Miguel Almirón Right Winger 29 Newcastle United 39 11 3 6 26 35.0 NaN
156 João Palhinha Defensive Midfield 27 Fulham FC 41 4 1 4 8 35.0 NaN
165 Allan Saint-Maximin Left Winger 26 Newcastle United 26 1 5 13 10 35.0 NaN
181 Marc Guéhi Centre-Back 22 Crystal Palace 38 1 0 1 1 35.0 NaN
188 Douglas Luiz Central Midfield 24 Aston Villa 36 5 5 4 8 35.0 NaN
195 Joachim Andersen Centre-Back 26 Crystal Palace 35 1 0 0 3 32.0 NaN
197 Ollie Watkins Centre-Forward 27 Aston Villa 36 15 6 2 9 32.0 NaN
201 Jacob Ramsey Central Midfield 21 Aston Villa 38 5 6 8 18 32.0 NaN
203 Robert Sánchez Goalkeeper 25 Brighton & Hove Albion 26 0 0 0 1 32.0 NaN
207 Brennan Johnson Attacking Midfield 21 Nottingham Forest 45 12 3 6 14 30.0 NaN
208 Eberechi Eze Attacking Midfield 24 Crystal Palace 36 8 3 10 15 30.0 NaN
215 Oihan Sancet Attacking Midfield 23 Athletic Bilbao 37 8 2 4 28 30.0 NaN
216 Gabri Veiga Attacking Midfield 20 Celta de Vigo 36 9 4 10 22 30.0 NaN
218 Morgan Gibbs-White Attacking Midfield 23 Nottingham Forest 38 5 7 3 9 30.0 NaN
222 Cheick Doucouré Defensive Midfield 23 Crystal Palace 33 0 3 0 18 30.0 NaN
All missing values are for clubs in England except two rows (210 and 213) which are Spanish clubs¶
In [7]:
#Filling missing data
data.loc[210,'country']='Spain'
data.loc[213,'country']='Spain'
data.fillna('England', inplace=True)
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 225 entries, 0 to 224
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   name      225 non-null    object 
 1   position  225 non-null    object 
 2   age       225 non-null    int64  
 3   club      225 non-null    object 
 4   matches   225 non-null    int64  
 5   goals     225 non-null    int64  
 6   assists   225 non-null    int64  
 7   subOn     225 non-null    int64  
 8   subOff    225 non-null    int64  
 9   value     225 non-null    float64
 10  country   225 non-null    object 
dtypes: float64(1), int64(6), object(4)
memory usage: 19.5+ KB
Since we have limited data, we will focus on goals and assits. Therefore, we will analyze only attackers.¶
In [8]:
#All position in our data:
set(data['position'])
Out[8]:
{'Attacking Midfield',
 'Central Midfield',
 'Centre-Back',
 'Centre-Forward',
 'Defensive Midfield',
 'Goalkeeper',
 'Left Winger',
 'Left-Back',
 'Right Winger',
 'Right-Back',
 'Second Striker'}
In [9]:
#We take only attackers
data=data[data['position'].isin(['Attacking Midfield',
'Centre-Forward',
'Left Winger',
'Right Winger',
'Second Striker'])]
In [10]:
#We create a new variable: goal Contribution Ratio, which is a measure the contribution of a player to their team's goals.
# It is calculated by dividing the number of goals and assists a player has by the total number of games played.
data['goalContributionRatio']=(data['goals']+data['assists'])/data['matches']

📊 Exploratory Data Aanalysis¶

In [11]:
data.head(5)
Out[11]:
name position age club matches goals assists subOn subOff value country goalContributionRatio
0 Kylian Mbappé Centre-Forward 24 Paris Saint-Germain 45 39 10 4 10 180.0 France 1.088889
1 Erling Haaland Centre-Forward 22 Manchester City 51 57 9 2 23 170.0 England 1.294118
2 Vinicius Junior Left Winger 22 Real Madrid 52 22 20 2 18 120.0 Spain 0.807692
4 Bukayo Saka Right Winger 21 Arsenal FC 52 15 13 9 20 110.0 England 0.538462
5 Jamal Musiala Attacking Midfield 20 Bayern Munich 49 15 14 13 30 110.0 Germany 0.591837
In [12]:
#Visualizing categorical data
plt.figure(figsize=(20,10))
plt.subplot(1,2,1)
plt1=sns.countplot(data=data, y='country', order = data['country'].value_counts().index)
plt.title('Number of Players playing at each league',fontsize=24)
plt.subplot(1,2,2)
plt2=plt.pie(data['club'].value_counts(), labels=data['club'].value_counts().index, autopct='%.0f%%')
plt.title('Clubs of top valued players',fontsize=24)
plt.tight_layout()

plt.show()
In [13]:
#Make it beautiful and add titles
plt.figure(figsize=(20,10))
plt.subplot(1,2,1)
plt1=sns.boxplot(x='position', y='value', data=data)
plt.title('Distribution of players value by position',fontsize=24)
plt.subplot(1,2,2)
plt2=sns.boxplot(x='country', y='value', data=data)
plt.title('Distribution of players value by league',fontsize=24)
plt.tight_layout()
plt.show()
We drop Club and Position a sthey seem to have little effect on the value¶
In [14]:
data.drop(['position','club'], axis=1, inplace=True)
In [15]:
#Visualize correlation matrix
df=data.copy()
labelencoder = LabelEncoder()
df['country'] = labelencoder.fit_transform(df['country'])
fig, ax = plt.subplots(figsize=(20,20))
sns.heatmap(df.corr(), annot=True, linewidths=.1,  cmap="RdBu");
In [16]:
sns.pairplot(df);

📈 Linear Regression¶

In [17]:
y=df.value
X=df[['matches','age','country','goalContributionRatio']]
In [18]:
X=df.drop(['name','value'], axis=1)
In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
pipeline = Pipeline([('std_scalar', StandardScaler())])
X_train = pipeline.fit_transform(X_train)
X_test = pipeline.transform(X_test)
reg = LinearRegression()
reg.fit(X_train, y_train)
Out[19]:
LinearRegression()
In [20]:
from sklearn import metrics
from sklearn.model_selection import cross_val_score

def cross_val(model):
    pred = cross_val_score(model, X, y, cv=10)
    return pred.mean()

def print_evaluate(true, predicted):  
    mae = metrics.mean_absolute_error(true, predicted)
    mse = metrics.mean_squared_error(true, predicted)
    rmse = np.sqrt(metrics.mean_squared_error(true, predicted))
    r2_square = metrics.r2_score(true, predicted)
    print('MAE:', mae)
    print('MSE:', mse)
    print('RMSE:', rmse)
    print('R2 Square', r2_square)
    print('--------------------------------')
    
def evaluate(true, predicted):
    mae = metrics.mean_absolute_error(true, predicted)
    mse = metrics.mean_squared_error(true, predicted)
    rmse = np.sqrt(metrics.mean_squared_error(true, predicted))
    r2_square = metrics.r2_score(true, predicted)
    return mae, mse, rmse, r2_square
test_pred = reg.predict(X_test)
train_pred = reg.predict(X_train)
print('Model Evaluation:\n--------------------------------')
print_evaluate(y_test, test_pred)
Model Evaluation:
--------------------------------
MAE: 18.869821333901264
MSE: 504.7084958543172
RMSE: 22.465718235888147
R2 Square 0.4997574197805914
--------------------------------
In [21]:
sns.regplot(x=y_test, y=test_pred,line_kws={'lw':1,'color': '#FF4500' ,'linestyle':'-.'}, marker="o");
In [22]:
sns.residplot(y_test,test_pred);
C:\Users\youss\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(
In [23]:
sns.displot(y_test-test_pred, kde=True);